/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	pdb_StructureSummary.csv (downloaded 5/22/2018)

Outputs: 	clean_summary.dta

*******************************************************************************/

clear all

* Load the raw PDB structure summary and confirm that structureid uniquely
* identifies observations (isid aborts the do-file otherwise).
import delimited using "${data_raw}PDB/pdb_StructureSummary.csv", clear
isid structureid

* Rename variables to camelCase in a single group rename.
* The old and new lists are matched by position.
rename (structureid structuretitle experimentaltechnique ndbid revisiondate ///
        authors structuremolecularweight macromoleculetype residuecount ///
        atomsitecount pdbdoi releasedate depositiondate) ///
       (structureId structureTitle experimentalTechnique ndbId revisionDate ///
        structureAuthor structureMolecularWt macroMoleculeType residueCount ///
        atomSiteCount pdbDoi releaseDate depositionDate)
 
* Split multiple revision dates ("#"-separated) into revisionDate1, revisionDate2, ...
* The number of pieces depends on the data, so read it from split's stored
* result r(nvars) instead of hard-coding it; a hard-coded count breaks the
* loops below as soon as a new download has a different maximum.
split revisionDate, parse("#")
local nRevisions = r(nvars)
drop revisionDate

* Turn date strings into date numbers.
* The string versions are moved to *_s names so the numeric versions can
* take over the original names.
rename releaseDate releaseDate_s
rename depositionDate depositionDate_s

forval n = 1/`nRevisions' {
	rename revisionDate`n' revisionDate_s`n'
}

* date() returns missing for strings it cannot parse with the "YMD" mask
gen releaseDate = date(releaseDate_s, "YMD")
gen depositionDate = date(depositionDate_s, "YMD")

forval n = 1/`nRevisions' {
	gen revisionDate`n' = date(revisionDate_s`n', "YMD")
}

* Drop the string versions and display the numeric dates as daily dates
drop releaseDate_s depositionDate_s revisionDate_s*
format releaseDate depositionDate revisionDate* %td
	
* Parse structure authors ("#"-separated) into structureAuthor1, structureAuthor2, ...
* The 5/22/2018 download has up to 88 authors on one structure; the actual
* count is taken from split's r(nvars) so the loops below always cover every
* author variable that was created.
split structureAuthor, parse("#")
local nAuthors = r(nvars)

* Strip periods and leading/trailing blanks from each author string
forval n = 1/`nAuthors' {
	replace structureAuthor`n' = subinstr(structureAuthor`n',".","",.)
	replace structureAuthor`n' = strtrim(structureAuthor`n')
}

* First author is author 1. Last author and author count start at author 1
* and are overwritten by each later non-empty author slot, so they end up at
* the highest-numbered non-empty slot.
gen firstStructureAuthor = structureAuthor1
gen lastStructureAuthor = structureAuthor1
gen numStructureAuthors = 1

* forvalues does not execute when the start exceeds the end, so a dataset
* with a single author variable simply skips this loop.
forval n = 2/`nAuthors' {
	replace lastStructureAuthor = structureAuthor`n' if structureAuthor`n' != ""
	replace numStructureAuthors = `n' if structureAuthor`n' != ""
}

* Build a standardized ID for the last structure author, then save

	* Work in upper case so the ID does not depend on capitalization
	replace lastStructureAuthor = upper(lastStructureAuthor)

	* Break the name at commas: lastStructureAuthor1 is the text before the
	* first comma, lastStructureAuthor2 the text after it
	* (presumably surname and initials -- confirm against the raw data)
	split lastStructureAuthor, parse(",")

	* Piece 2: remove hyphens, then trim surrounding blanks
	replace lastStructureAuthor2 = strtrim(subinstr(lastStructureAuthor2, "-", "", .))

	* Piece 1: replace blanks with hyphens
	replace lastStructureAuthor1 = subinstr(lastStructureAuthor1, " ", "-", .)

	* First and second characters of piece 2.
	* Only first_init enters the ID below; second_init is kept in the saved data.
	gen first_init = substr(lastStructureAuthor2, 1, 1)
	gen second_init = substr(lastStructureAuthor2, 2, 1)

	* Standardized name string: nameId:<piece 1>:<first initial>
	gen lastAuthorId = "nameId:" + lastStructureAuthor1 + ":" + first_init
	format lastAuthorId %25s

save "${data_clean}clean_summary.dta", replace
